Set the working dir
setwd("/mnt/picea/projects/spruce/akaerkoenen/laser-capture-microdissection-olga")
Load libraries
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(edgeR))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(LSD))
suppressPackageStartupMessages(library(matrixStats))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(vsn))
Source helpers
source("~/Git/UPSCb/src/R/expressionSpecificityUtility.R")
source("~/Git/UPSCb/src/R/plot.multidensity.R")
Load the data
mat <- read.csv("analysis/HTSeq/raw-unormalised_merged-technical-replicates_data.csv",
row.names = 1)
Plotting parameters
mar <- par("mar")
pal <- c(1,2,brewer.pal(8,"Dark2"))
par(mar=c(8.1,4.1,3.1,0.1))
barplot(colSums(mat),las=2,main="number of reads")
boxplot(log10(mat+1),las=2,ylab="log10(counts+1)",main="gene expression")
dge <- DGEList(mat,group = sub("S[1-4]\\.","",colnames(mat)))
dge <- estimateCommonDisp(dge,verbose=TRUE)
## Disp = 1.05166 , BCV = 1.0255
plotBCV(dge)
conditions <- colnames(mat)
dds <- DESeqDataSetFromMatrix(
countData = mat,
colData = data.frame(condition=conditions),
design = ~ condition)
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
names(sizes) <- colnames(mat)
pander(sizes)
| S1.ray.cells | S1.xylem.tracheids | S1.whole.sections | S3.ray.cells |
|---|---|---|---|
| 0.2622 | 0.9199 | 0.7349 | 1.771 |
| S3.xylem.tracheids | S3.whole.sections | S2.whole.sections | S4.whole.sections |
|---|---|---|---|
| 1.594 | 1.685 | 1.024 | 0.9086 |
| S2.ray.cells | S2.xylem.tracheids |
|---|---|
| 1.228 | 1.494 |
boxplot(sizes, main="Sequencing libraries size factor")
rld <- rlog(dds)
rld <- assay(rld)
colnames(rld) <- colnames(mat)
rld <- rld[!rowSums(rld==0)==ncol(rld),]
vsd <- varianceStabilizingTransformation(dds)
vsd <- assay(vsd)
colnames(vsd) <- colnames(mat)
dev.null <- sapply(1:ncol(vsd),function(i,r,v){
heatscatter(r[,i],v[match(rownames(r),rownames(v)),i],
xlab="rlog",ylab="vst",main=colnames(v)[i])
},rld,vsd)
plot.multidensity(as.data.frame(rld),
col=pal[as.integer(factor(colnames(rld)))],
legend.x="topright",
legend=levels(factor(colnames(rld))),
legend.col=pal,
legend.lwd=2,
main="sample rlog expression distribution",
xlab="per gene rlog expression (appx. log2)")
plot.multidensity(as.data.frame(vsd),
col=pal[as.integer(factor(colnames(rld)))],
legend.x="topright",
legend=levels(factor(colnames(rld))),
legend.col=pal,
legend.lwd=2,
main="sample vst expression distribution",
xlab="per gene vst expression (appx. log2)")
pc <- prcomp(t(rld))
percent <- round(summary(pc)$importance[2,]*100)
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=pal[as.integer(factor(colnames(rld)))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("top",pch=19,
col=pal,
legend=levels(factor(colnames(rld))))
The 2nd dimension (accounting 17% of the overall variance) nor the third dimension seem to have a biological explanation.
plot(pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
col=pal[as.integer(factor(colnames(rld)))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("center",pch=19,
col=pal,
legend=levels(factor(colnames(rld))))
plot(pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
col=pal[as.integer(factor(sub("\\..*","",colnames(rld))))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
xsp <- expressionSpecificity(vsd-min(vsd),sub("S[1-4]\\.","",colnames(rld)),output = "complete")
## Calculating tissue specificity
tab <- table(xsp[,"n"])
pander(tab)
| 0 | 1 | 2 | 3 |
|---|---|---|---|
| 16913 | 8719 | 7639 | 37465 |
barplot(tab,ylab="genes",xlab="number of tissues", main="gene expression observed accross tissues")
hist(xsp[,"score"],breaks = seq(0,1,0.01),xlab="expression specificity",main="")
hist(xsp[xsp[,"n"] == 1,"score"],breaks = seq(0,1,0.01),xlab="expression specificity",main="")
hist(xsp[xsp[,"n"] == 2,"score"],breaks = seq(0,1,0.01),xlab="expression specificity",main="")
hist(xsp[xsp[,"n"] == 3,"score"],breaks = seq(0,1,0.01),xlab="expression specificity",main="")
plot.multidensity(split(xsp[xsp[,"n"]>0,"maxn"],xsp[xsp[,"n"]>0,"n"]))
boxplot(split(xsp[xsp[,"n"]>0,"maxn"],xsp[xsp[,"n"]>0,"n"]))
sapply(lapply(split.data.frame(xsp[xsp[,"n"]>1,c("maxn","score")],xsp[xsp[,"n"]>1,"n"]),as.matrix),function(o){
heatscatter(o[,1],o[,2])})
## $`2`
## NULL
##
## $`3`
## NULL
pander(colSums(!is.na(xsp[xsp[,"n"] == 1,2:4])))
| ray.cells | whole.sections | xylem.tracheids |
|---|---|---|
| 2571 | 4873 | 1275 |
vsd <- vsd - min(vsd)
mat <- matrix(apply(expand.grid(1:5,1:9),1,function(ro){sum(rowSums(vsd>ro[1])>ro[2])}),
nrow=5,ncol=9)
heatmap.2(mat/sum(rowSums(vsd)>0))
pander(mat/sum(rowSums(vsd)>0))
| 0.6703 | 0.6187 | 0.5837 | 0.553 | 0.5232 | 0.49 | 0.4526 | 0.4003 | 0.2959 |
| 0.5142 | 0.4772 | 0.4506 | 0.4273 | 0.405 | 0.3808 | 0.3522 | 0.3116 | 0.2337 |
| 0.3891 | 0.3577 | 0.3357 | 0.3168 | 0.2991 | 0.2803 | 0.2575 | 0.2279 | 0.1744 |
| 0.2758 | 0.2499 | 0.2325 | 0.2181 | 0.2044 | 0.1906 | 0.1752 | 0.1559 | 0.121 |
| 0.1827 | 0.1642 | 0.1509 | 0.1403 | 0.1305 | 0.121 | 0.1103 | 0.0976 | 0.07644 |
## TODO make the selector a parameter to plot the PCA and a heatmap
dev.null <- apply(expand.grid(1:5,1:9),1,function(ro,sds){
sel <- rowSums(vsd>ro[1])>ro[2]
pc <- prcomp(t(vsd[sel,]))
percent <- round(summary(pc)$importance[2,]*100)
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=pal[as.integer(factor(sub("S[1-4]\\.","",colnames(rld))))],
pch=19,
main=paste0("Principal Component Analysis (",ro[1],",",ro[2],")"),
sub="variance stabilized counts")
legend("top",pch=19,
col=pal,
legend=levels(factor(sub("S[1-4]\\.","",colnames(rld)))))
#' ### The 2nd and 3rd dims
#' The 2nd dimension (accounting 17% of the overall variance) nor the third
#' dimension seem to have a biological explanation.
plot(pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
col=pal[as.integer(factor(sub("S[1-4]\\.","",colnames(rld))))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("center",pch=19,
col=pal,
legend=levels(factor(sub("S[1-4]\\.","",colnames(rld)))))
heatmap.2(vsd[sel,][order(sds[sel],decreasing=TRUE)[1:1000],],
labRow = NA,trace = "none",cexCol = 0.6 )
},apply(vsd,1,sd))